{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "265eb140-c075-47b9-b4b2-3453faa2d5cb",
   "metadata": {},
   "source": [
    "### Parsing VEP most severe consequence (MSC) output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "725717b4-0d96-4653-9fe6-8b540728e713",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b22d7bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root of the analysis working directory, kept both as a plain string and as a Path.\n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3fcf9876",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input file to parse, located relative to the working directory.\n",
    "vep_msc_in = wkdir_path / \"4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_output.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5911e4c3-90e6-426f-8d4e-d3e6ccf4a6ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names assigned positionally to the tab-separated fields of each data line.\n",
    "columns = [\"Uploaded_variation\", \"Location\", \"Allele\", \"Gene\", \"Feature\",\n",
    "           \"Feature_type\", \"Consequence\", \"cDNA_position\", \"CDS_position\",\n",
    "           \"Protein_position\", \"Amino_acids\", \"Codons\", \"Existing_variation\",\n",
    "           \"Extra\"]\n",
    "\n",
    "# Gather all rows in a plain list and build the DataFrame once at the end;\n",
    "# enlarging the frame with `.loc[len(df)]` one row at a time is slow on large files.\n",
    "vrnt_msc_rows = []\n",
    "with open(vep_msc_in, 'r') as vep_msc:\n",
    "    for line_no, line in enumerate(vep_msc, start=1):\n",
    "        # '#'-prefixed lines are not data rows; blank lines carry no record.\n",
    "        if line.startswith('#') or not line.strip():\n",
    "            continue\n",
    "        vrnt_msc = line.strip().split('\\t')\n",
    "        # Fail loudly on a malformed row instead of letting pandas pad or misalign it.\n",
    "        if len(vrnt_msc) != len(columns):\n",
    "            raise ValueError(\n",
    "                f'{vep_msc_in}: line {line_no} has {len(vrnt_msc)} fields, '\n",
    "                f'expected {len(columns)}'\n",
    "            )\n",
    "        vrnt_msc_rows.append(vrnt_msc)\n",
    "\n",
    "vrnt_msc_df = pd.DataFrame(vrnt_msc_rows, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "82efa6f2-9559-4b40-92c8-f6f849846c70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns whose values are not all identical across rows; a column\n",
    "# holding one repeated value is treated as having no entries.\n",
    "# NOTE(review): this also drops a column that repeats a single real value, and\n",
    "# every column when the table has exactly one row -- confirm that is intended.\n",
    "cols_to_keep = [\n",
    "    name for name in columns\n",
    "    if vrnt_msc_df[name].nunique(dropna=False) != 1\n",
    "]\n",
    "vrnt_msc_trunc_df = vrnt_msc_df[cols_to_keep]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "007fae83-0ba1-4325-b9ec-ae7db8a9ccdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the reduced table as a tab-separated file under the working directory,\n",
    "# omitting the row index.\n",
    "vep_msc_out = wkdir_path / \"4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv\"\n",
    "vrnt_msc_trunc_df.to_csv(vep_msc_out, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "212fa7a7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
